{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Databricks Vector Search\n",
    "\n",
    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search python client."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index llama-index-vector-stores-databricks\n",
    "%pip install databricks-vectorsearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import databricks dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from databricks.vector_search.client import (\n",
    "    VectorSearchIndex,\n",
    "    VectorSearchClient,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import LlamaIndex dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    ServiceContext,\n",
    "    StorageContext,\n",
    ")\n",
    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p 'data/paul_graham/'\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load documents\n",
    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
    "print(f\"Total documents: {len(documents)}\")\n",
    "print(f\"First document, id: {documents[0].doc_id}\")\n",
    "print(f\"First document, hash: {documents[0].hash}\")\n",
    "print(\n",
    "    \"First document, text\"\n",
    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a Databricks Vector Search endpoint which will serve the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search endpoint\n",
    "client = VectorSearchClient()\n",
    "client.create_endpoint(\n",
    "    name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the Databricks Vector Search index, and build it from the documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search index\n",
    "# it must be placed inside a Unity Catalog-enabled schema\n",
    "\n",
    "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n",
    "databricks_index = client.create_direct_access_index(\n",
    "    endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
    "    index_name=\"my_catalog.my_schema.my_test_table\",\n",
    "    primary_key=\"my_primary_key_name\",\n",
    "    embedding_dimension=1536,  # match the embeddings model dimension you're going to use\n",
    "    embedding_vector_column=\"my_embedding_vector_column_name\",  # you name this anything you want - it'll be picked up by the LlamaIndex class\n",
    "    schema={\n",
    "        \"my_primary_key_name\": \"string\",\n",
    "        \"my_embedding_vector_column_name\": \"array<double>\",\n",
    "        \"text\": \"string\",  # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text,\n",
    "        \"doc_id\": \"string\",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
    "        # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
    "        # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
    "    },\n",
    ")\n",
    "\n",
    "databricks_vector_store = DatabricksVectorSearch(\n",
    "    index=databricks_index,\n",
    "    text_column=\"text\",\n",
    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
    ")  # text_column is required for self-managed embeddings\n",
    "storage_context = StorageContext.from_defaults(\n",
    "    vector_store=databricks_vector_store\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Query the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"Why did the author choose to work on AI?\")\n",
    "\n",
    "print(response.response)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)",
   "widgets": {}
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
